Nishanth Alladi (nalladi2), Ethan Cook (elcook3), Davy Ji (davyji2)
Each member contributed to several pair-programming sessions in which the entirety of the report was written. Each member contributed both to theorizing solutions and to implementing code.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
# Seed Python's PRNG for reproducibility.
# NOTE(review): numpy's global generator (used later for shuffling) is NOT
# seeded here -- confirm whether np.random.seed(3970) was intended as well.
random.seed(3970)
# Load the data: 'Y' is the response, all remaining columns are predictors.
myData = pd.read_csv("Coding2_Data.csv")
var_names = myData.columns
y = myData[['Y']].to_numpy()                 # (n, 1) response vector
X = myData.drop(['Y'], axis = 1).to_numpy()  # (n, p) design matrix
X.shape, len(y)
# Notebook output:
((506, 13), 506)
def one_var_lasso(r, x, lam):
    """Solve the one-variable lasso subproblem by soft-thresholding.

    Minimizes (1/n)*||r - x*b||^2 + 2*lam*|b| over the scalar b, where
    r is the current (partial) residual and x is one standardized column
    of the design matrix.

    Parameters
    ----------
    r : (n, 1) residual vector.
    x : (n, 1) predictor column.
    lam : non-negative penalty level.

    Returns
    -------
    The soft-thresholded coefficient (0 when |OLS estimate| is below the
    threshold), with the same array shape as ``r.T @ x``.
    """
    # Hoist the squared norm: the original computed it twice per call.
    xx = np.square(np.linalg.norm(x))
    a = (r.T @ x) / xx            # single-variable OLS estimate
    n = 2 * r.size * lam / xx     # soft-threshold width (threshold = n/2)
    if a > n / 2:
        return a - n / 2
    elif abs(a) <= n / 2:
        return 0
    else:
        return a + n / 2
def MyLasso(X, y, lam_seq, maxit = 100):
    """Lasso solution path via cyclic coordinate descent.

    Parameters
    ----------
    X : (n, p) design matrix WITHOUT an intercept column.
    y : (n, 1) response vector.
    lam_seq : sequence of lambda values arranged from large to small
        (the residual and coefficients are carried over between lambdas
        as warm starts).
    maxit : number of full coordinate-descent sweeps per lambda.

    Returns
    -------
    B : (p+1, len(lam_seq)) coefficient matrix on the ORIGINAL scale of X.
        Row 0 holds the intercept for each lambda; rows 1..p hold the
        slope coefficients.
    """
    n, p = X.shape
    nlam = len(lam_seq)
    B = np.zeros((p + 1, nlam))

    # Standardize each column of X, recording the centers and scales so
    # the coefficients can be mapped back to the original scale at the end.
    centers = X.mean(0)
    scales = X.std(0)
    newX = (X - centers) / scales

    # Initialize the coefficient vector b and the residual vector r.
    b = np.zeros(p)
    r = y - y.mean(0)

    # Triple nested loop: lambdas -> sweeps -> coordinates.
    for m in range(nlam):
        for step in range(maxit):
            for j in range(p):
                X_j = newX[:, j].reshape(-1, 1)
                r = r + X_j * b[j]   # put coordinate j back into the residual
                # .item() makes the (1,1)-array -> scalar conversion explicit
                # (implicit conversion is deprecated in NumPy >= 1.25).
                b[j] = np.asarray(one_var_lasso(r, X_j, lam_seq[m])).item()
                r = r - X_j * b[j]   # remove the updated contribution
        B[1:, m] = b

    # Undo the standardization: rescale the slopes, then recover the
    # intercepts (stored in row 0 of B, one per lambda).
    B[1:, :] = B[1:, :] / scales[:, np.newaxis]
    B[0, :] = np.mean(y) - B[1:, :].T @ centers
    return B
# 80 lambda values arranged from large to small on the log scale, as
# MyLasso's warm starts expect.
log_lam_seq = np.linspace(-1, -8, num = 80)
lam_seq = np.exp(log_lam_seq)
myout = MyLasso(X, y, lam_seq, maxit = 100)
p, _ = myout.shape
# Plot one solution-path curve per predictor (row 0 of myout is the
# intercept, so it is skipped).
plt.figure(figsize = (12,8))
for i in range(p-1):
    plt.plot(log_lam_seq, myout[i+1, :], label = var_names[i])
plt.xlabel('Log Lambda')
plt.ylabel('Coefficients')
plt.title('Lasso Paths - Numpy implementation')
plt.legend()
plt.axis('tight')
# Notebook output (axis limits returned by plt.axis):
(-8.35, -0.6499999999999999, -0.3099945835128368, 0.4997421988480716)
# Sanity check against the reference coefficient matrix: the maximum
# absolute deviation should be small.
lasso_coef = pd.read_csv("Coding2_lasso_coefs.csv").to_numpy()
lasso_coef.shape
# Notebook output:
(14, 80)
abs(myout - lasso_coef).max()
# Notebook output:
0.004645317416207995
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as lm
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Case I data: 506 observations, 91 predictors.
url = "https://raw.githubusercontent.com/liangfgithub/liangfgithub.github.io/master/Data/Coding2_Data2.csv"
myData = pd.read_csv(url)
# myData.head()
Y = myData['Y']
X = myData.drop(['Y'], axis = 1)
X.shape, len(Y)
# Notebook output:
((506, 91), 506)
# Random 75/25 train/test split.
# NOTE(review): np.random is never seeded, so this split (and every result
# below) varies between runs -- confirm that is acceptable.
n = len(Y)
indices = np.arange(0, n)
np.random.shuffle(indices)
test_ind = indices[:int(np.floor(0.25*n))]
train_ind = indices[len(test_ind):]
# Splitting the data into training and testing sets
X_train = X.iloc[train_ind]
Y_train = Y[train_ind]
X_test = X.iloc[test_ind]
Y_test = Y[test_ind]
# Baseline: ordinary least squares on all predictors ("Full" model).
full = lm().fit(X_train, Y_train)
mean_squared_error(Y_test, full.predict(X_test))
# Notebook output:
0.026016949802181194
import warnings
warnings.filterwarnings("ignore")
# --- Ridge: penalty chosen by 10-fold CV over a log-spaced alpha grid.
# NOTE(review): `normalize=True` was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- this code assumes an older sklearn; on newer versions
# standardize via Pipeline(StandardScaler(), ...) instead.
ridge_alphas = np.logspace(-10, 1, 100)
ridgecv = RidgeCV(alphas = ridge_alphas, cv = 10,
                  scoring = 'neg_mean_squared_error',
                  normalize = True)
ridgecv.fit(X_train, Y_train)
ridgecv.alpha_
# Notebook output:
0.00278255940220712
# Refit ridge at the CV-selected alpha and score on the test set.
ridge_model = Ridge(alpha = ridgecv.alpha_, normalize = True)
ridge_model.fit(X_train, Y_train)
mean_squared_error(Y_test, ridge_model.predict(X_test))
# Notebook output:
0.027840140206646286
# --- Lasso: penalty chosen by 10-fold CV.
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
lasso_alphas = np.logspace(-10, 1, 100)
lassocv = LassoCV(alphas = lasso_alphas, cv = 10,
                  normalize = True)
lassocv.fit(X_train, Y_train)
lassocv.alpha_
# Notebook output:
1.2915496650148827e-05
# One-standard-error rule: alpha_1se is the LARGEST alpha whose mean CV
# error lies within one standard error of the minimum mean CV error.
mean_mse = np.mean(lassocv.mse_path_, axis=1)
std_mse = np.std(lassocv.mse_path_, axis=1) / np.sqrt(10)  # SE across the 10 folds
cv_alphas = lassocv.alphas_
min_idx = np.argmin(mean_mse)
alpha_min = cv_alphas[min_idx]
threshold = mean_mse[min_idx] + std_mse[min_idx]
alpha_1se = max(cv_alphas[np.where(mean_mse <= threshold)])
alpha_min, alpha_1se #alpha_min = lassocv.alpha_
# Notebook output:
(1.2915496650148827e-05, 7.742636826811262e-05)
# Lasso refit at alpha_min.
lasso_model_min = Lasso(alpha = alpha_min, normalize = True, max_iter=10000)
lasso_model_min.fit(X_train, Y_train)
mean_squared_error(Y_test, lasso_model_min.predict(X_test))
# Notebook output:
0.027654957251775508
# Lasso refit at alpha_1se (larger penalty).
lasso_model_1se = Lasso(alpha = alpha_1se, normalize = True, max_iter=10000)
lasso_model_1se.fit(X_train, Y_train)
mean_squared_error(Y_test, lasso_model_1se.predict(X_test))
# Notebook output:
0.034435035735162337
# L.Refit: plain OLS using only the variables the 1se lasso kept.
nonzero_indices = np.where(lasso_model_1se.coef_ != 0)[0]
lm_refit = lm()
lm_refit.fit(X_train.iloc[:, nonzero_indices], Y_train)
mean_squared_error(Y_test, lm_refit.predict(X_test.iloc[:, nonzero_indices]))
# Notebook output:
0.030269297621497604
class PCR(object):
    """Principal-components regression with the number of components
    chosen by K-fold cross-validation on the training data.
    """

    def __init__(self, num_folds=10):
        # Number of CV folds used inside fit().
        self.folds = num_folds

    def fit(self, X, Y):
        """Select the number of PCs by CV, then refit on all of (X, Y).

        X : (n, p) numpy array; Y : (n,) numpy array.
        Sets self.ncomp, self.transform (fitted scaler+PCA pipeline) and
        self.model (OLS on the retained PC scores).
        """
        n, p = X.shape
        indices = np.arange(n)
        np.random.shuffle(indices)
        index_sets = np.array_split(indices, self.folds)
        # Largest component count that every training fold can support.
        ncomp = min(p, n - 1 - max(len(s) for s in index_sets))
        cv_err = np.zeros(ncomp)
        for ifold in range(self.folds):
            # BUGFIX: the original used np.delete(...).ravel() on the fold
            # list, which misbehaves when folds have unequal sizes.
            train_inds = np.concatenate(
                [index_sets[j] for j in range(self.folds) if j != ifold])
            test_inds = index_sets[ifold]
            pipeline = Pipeline([('scaling', StandardScaler()), ('pca', PCA())])
            X_tr = pipeline.fit_transform(X[train_inds, :])
            # PC scores are orthogonal, so each slope is a simple projection.
            coefs = Y[train_inds].T @ X_tr / np.sum(X_tr**2, axis=0)
            b0 = np.mean(Y[train_inds])
            X_te = pipeline.transform(X[test_inds, :])
            for k in range(1, ncomp + 1):
                # Prediction from the first k principal components.
                preds = X_te[:, :k] @ coefs.T[:k] + b0
                # BUGFIX: the original did `cv_err[k] += cv_err[k] + ...`,
                # doubling the running total each fold, and indexed the
                # k-component model at slot k (off by one vs. ncomp below).
                cv_err[k - 1] += np.sum((Y[test_inds] - preds)**2)
        # cv_err[k-1] holds the total CV SSE of the k-component model.
        self.ncomp = np.argmin(cv_err) + 1
        # Refit the scaler + PCA with the chosen size on the full data.
        pipeline = Pipeline([('scaling', StandardScaler()),
                             ('pca', PCA(n_components=self.ncomp))])
        self.transform = pipeline.fit(X)
        self.model = lm().fit(self.transform.transform(X), Y)

    def predict(self, X):
        """Project X through the fitted scaler+PCA and apply the OLS model."""
        X_ = self.transform.transform(X)
        return self.model.predict(X_)
# Fit PCR (component count chosen by its internal 10-fold CV) and score
# it on the held-out test set.
pcr = PCR()
pcr.fit(X_train.to_numpy(), Y_train.to_numpy())
mean_squared_error(Y_test, pcr.predict(X_test.to_numpy()))
# Notebook output:
0.02865171357952127
# Repeat the whole Case I pipeline over 50 random 75/25 splits, collecting
# the test-set MSPE of each of the six procedures.
n = len(Y)
mspes = [[], [], [], [], [], []]  # Full, Ridge, Lasso min, Lasso 1se, L refit, PCR
for i in range(50):
    indices = np.arange(0, n)
    np.random.shuffle(indices)
    test_ind = indices[:int(np.floor(0.25*n))]
    train_ind = indices[len(test_ind):]
    # Splitting the data into training and testing sets
    X_train = X.iloc[train_ind]
    Y_train = Y[train_ind]
    X_test = X.iloc[test_ind]
    Y_test = Y[test_ind]
    # Full: OLS on all predictors.
    full = lm().fit(X_train, Y_train)
    mspes[0].append(mean_squared_error(Y_test, full.predict(X_test)))
    # Ridge: alpha by 10-fold CV, then refit.
    ridge_alphas = np.logspace(-10, 1, 100)
    ridgecv = RidgeCV(alphas = ridge_alphas, cv = 10, scoring = 'neg_mean_squared_error', normalize = True)
    ridgecv.fit(X_train, Y_train)
    ridgecv.alpha_
    ridge_model = Ridge(alpha = ridgecv.alpha_, normalize = True)
    ridge_model.fit(X_train, Y_train)
    mspes[1].append(mean_squared_error(Y_test, ridge_model.predict(X_test)))
    # Lasso prep: CV path plus the one-standard-error rule (same
    # computation as in the single-split section above).
    lasso_alphas = np.logspace(-10, 1, 100)
    lassocv = LassoCV(alphas = lasso_alphas, cv = 10, normalize = True)
    lassocv.fit(X_train, Y_train)
    lassocv.alpha_
    mean_mse = np.mean(lassocv.mse_path_, axis=1)
    std_mse = np.std(lassocv.mse_path_, axis=1) / np.sqrt(10)
    cv_alphas = lassocv.alphas_
    min_idx = np.argmin(mean_mse)
    alpha_min = cv_alphas[min_idx]
    threshold = mean_mse[min_idx] + std_mse[min_idx]
    alpha_1se = max(cv_alphas[np.where(mean_mse <= threshold)])
    alpha_min, alpha_1se #alpha_min = lassocv.alpha_
    # Lasso min
    lasso_model_min = Lasso(alpha = alpha_min, normalize = True, max_iter=10000)
    lasso_model_min.fit(X_train, Y_train)
    mspes[2].append(mean_squared_error(Y_test, lasso_model_min.predict(X_test)))
    # Lasso 1se
    lasso_model_1se = Lasso(alpha = alpha_1se, normalize = True, max_iter=10000)
    lasso_model_1se.fit(X_train, Y_train)
    mspes[3].append(mean_squared_error(Y_test, lasso_model_1se.predict(X_test)))
    # L refit: OLS on the variables the 1se lasso kept.
    nonzero_indices = np.where(lasso_model_1se.coef_ != 0)[0]
    lm_refit = lm()
    lm_refit.fit(X_train.iloc[:, nonzero_indices], Y_train)
    mspes[4].append(mean_squared_error(Y_test, lm_refit.predict(X_test.iloc[:, nonzero_indices])))
    # PCR with internally CV-selected component count.
    pcr = PCR()
    pcr.fit(X_train.to_numpy(), Y_train.to_numpy())
    mspes[5].append(mean_squared_error(Y_test, pcr.predict(X_test.to_numpy())))
# Box plot of the 50 MSPEs per method.
mspes_df = pd.DataFrame(mspes, index = ['Full', 'Ridge', 'Lasso min', 'Lasso 1se', 'L refit', 'PCR']).transpose()
import plotly.io as pio
pio.renderers.default = 'notebook'
import plotly.express as px
fig = px.box(mspes_df, points = 'all', labels = {'value': 'MSPE', 'variable': 'Method'})
fig.show()
Which procedure or procedures yield the best performance in terms of MSPE? Ridge regression seems to yield the best performance in terms of MSPE
Conversely, which procedure or procedures show the poorest performance? L refit seems to show the poorest performance
In the context of Lasso regression, which procedure, Lasso.min or Lasso.1se, yields a better MSPE? Lasso.min yields a smaller median MSPE, but has more variation than Lasso.1se
Is refitting advantageous in this case? In other words, does L.Refit outperform Lasso.1se? No
Is variable selection or shrinkage warranted for this particular dataset? To clarify, do you find the performance of the Full model to be comparable to, or divergent from, the best-performing procedure among the other five? By visual inspection, the Full model appears noticeably worse than Ridge regression, so shrinkage does seem warranted here.
# Case II: Coding2_Data3.csv contains all of Case I's features plus extra
# columns (per the assignment, additional noise features -- not verifiable
# from this file alone).
url = "https://raw.githubusercontent.com/liangfgithub/liangfgithub.github.io/master/Data/Coding2_Data3.csv"
myData = pd.read_csv(url)
# myData.head()
Y = myData['Y']
X = myData.drop(['Y'], axis = 1)
# Same 50-replicate simulation as Case I, without the Full-model baseline
# (five procedures instead of six).
n = len(Y)
mspes2 = [[], [], [], [], []]  # Ridge, Lasso min, Lasso 1se, L refit, PCR
for i in range(50):
    indices = np.arange(0, n)
    np.random.shuffle(indices)
    test_ind = indices[:int(np.floor(0.25*n))]
    train_ind = indices[len(test_ind):]
    # Splitting the data into training and testing sets
    X_train = X.iloc[train_ind]
    Y_train = Y[train_ind]
    X_test = X.iloc[test_ind]
    Y_test = Y[test_ind]
    # Ridge: alpha by 10-fold CV, then refit.
    ridge_alphas = np.logspace(-10, 1, 100)
    ridgecv = RidgeCV(alphas = ridge_alphas, cv = 10, scoring = 'neg_mean_squared_error', normalize = True)
    ridgecv.fit(X_train, Y_train)
    ridgecv.alpha_
    ridge_model = Ridge(alpha = ridgecv.alpha_, normalize = True)
    ridge_model.fit(X_train, Y_train)
    mspes2[0].append(mean_squared_error(Y_test, ridge_model.predict(X_test)))
    # Lasso prep: CV path plus the one-standard-error rule.
    lasso_alphas = np.logspace(-10, 1, 100)
    lassocv = LassoCV(alphas = lasso_alphas, cv = 10, normalize = True)
    lassocv.fit(X_train, Y_train)
    lassocv.alpha_
    mean_mse = np.mean(lassocv.mse_path_, axis=1)
    std_mse = np.std(lassocv.mse_path_, axis=1) / np.sqrt(10)
    cv_alphas = lassocv.alphas_
    min_idx = np.argmin(mean_mse)
    alpha_min = cv_alphas[min_idx]
    threshold = mean_mse[min_idx] + std_mse[min_idx]
    alpha_1se = max(cv_alphas[np.where(mean_mse <= threshold)])
    alpha_min, alpha_1se #alpha_min = lassocv.alpha_
    # Lasso min
    lasso_model_min = Lasso(alpha = alpha_min, normalize = True, max_iter=10000)
    lasso_model_min.fit(X_train, Y_train)
    mspes2[1].append(mean_squared_error(Y_test, lasso_model_min.predict(X_test)))
    # Lasso 1se
    lasso_model_1se = Lasso(alpha = alpha_1se, normalize = True, max_iter=10000)
    lasso_model_1se.fit(X_train, Y_train)
    mspes2[2].append(mean_squared_error(Y_test, lasso_model_1se.predict(X_test)))
    # L refit: OLS on the variables the 1se lasso kept.
    nonzero_indices = np.where(lasso_model_1se.coef_ != 0)[0]
    lm_refit = lm()
    lm_refit.fit(X_train.iloc[:, nonzero_indices], Y_train)
    mspes2[3].append(mean_squared_error(Y_test, lm_refit.predict(X_test.iloc[:, nonzero_indices])))
    # PCR with internally CV-selected component count.
    pcr = PCR()
    pcr.fit(X_train.to_numpy(), Y_train.to_numpy())
    mspes2[4].append(mean_squared_error(Y_test, pcr.predict(X_test.to_numpy())))
# Box plot of the 50 MSPEs per method for Case II.
mspes2_df = pd.DataFrame(mspes2, index = ['Ridge', 'Lasso min', 'Lasso 1se', 'L refit', 'PCR']).transpose()
import plotly.express as px
fig = px.box(mspes2_df, points = 'all', labels = {'value': 'MSPE', 'variable': 'Method'})
fig.show()
Which procedure or procedures yield the best performance in terms of MSPE? Lasso min seems to yield the best performance in terms of MSPE
Conversely, which procedure or procedures show the poorest performance? PCR seems to show the poorest performance
Have you observed any procedure or procedures that performed well in Case I but exhibited poorer performance in Case II, or vice versa? If so, please offer an explanation. Ridge and PCR perform much worse in Case II, and L refit seems to do better in Case II.
Given that Coding2_Data3.csv includes all features found in Coding2_Data2.csv, one might anticipate that the best MSPE in Case II would be equal to or lower than the best MSPE in Case I. Do your simulation results corroborate this expectation? If not, please offer an explanation. Our results do not corroborate this expectation: all methods perform worse in terms of MSPE in Case II. We think this is because Case II has many more columns that are purely noise than Case I, meaning that much more of the data is noise that contributes nothing to the models. We originally expected Lasso to perform the same in Cases I and II, but realized that there is no guarantee that Lasso would assign the noisy predictor variables coefficients of 0, hence contributing to the increased MSPE. We would expect that as lambda increases, the noisy predictors become less and less relevant -- but this doesn't necessarily mean that MSPE would go down, since increasing lambda also makes the real predictors less relevant.